1. Load Libraries and Data

library(tidyverse)
library(factoextra)
library(FactoMineR)
library(cluster)
library(glue)

set.seed(123)
theme_set(theme_minimal())
# Load enriched dataset - FINAL EVOLUTIONS ONLY
pokemon_df <- read_csv("data/pokemon_data_final_evolutions_enriched.csv")

cat(glue("Dataset: {nrow(pokemon_df)} Pokemon with {ncol(pokemon_df)} features\n"))
## Dataset: 742 Pokemon with 19 features

2. Feature Preparation

2.1 Select Numeric Features

# Core numeric features for PCA/clustering
numeric_features <- c(
  "hp", "attack", "defense", "sp_atk", "sp_def", "speed",
  "height_m", "weight_kgs", "bmi"
)

# Create analysis dataframe
pca_df <- pokemon_df |>
  select(dex, name, category, generation, type_1, all_of(numeric_features)) |>
  drop_na()

cat(glue("
Analysis dataset: {nrow(pca_df)} Pokemon
Features: {length(numeric_features)} numeric variables
"))
## Analysis dataset: 742 Pokemon
## Features: 9 numeric variables
head(pca_df)
## # A tibble: 6 × 14
##     dex name       category generation type_1    hp attack defense sp_atk sp_def
##   <dbl> <chr>      <chr>    <chr>      <chr>  <dbl>  <dbl>   <dbl>  <dbl>  <dbl>
## 1     3 Venusaur   Regular  Gen 1      Grass     80     82      83    100    100
## 2     3 Mega Venu… Regular  Gen 1      Grass     80    100     123    122    120
## 3     6 Charizard  Regular  Gen 1      Fire      78     84      78    109     85
## 4     6 Mega Char… Regular  Gen 1      Fire      78    104      78    159    115
## 5     6 Mega Char… Regular  Gen 1      Fire      78    130     111    130     85
## 6     9 Blastoise  Regular  Gen 1      Water     79     83     100     85    105
## # ℹ 4 more variables: speed <dbl>, height_m <dbl>, weight_kgs <dbl>, bmi <dbl>

2.3 Outlier Detection and Analysis

cat("=== OUTLIER DETECTION ===\n\n")
## === OUTLIER DETECTION ===
# Function to detect outliers using IQR method
detect_outliers <- function(x, multiplier = 3) {
  q1 <- quantile(x, 0.25, na.rm = TRUE)
  q3 <- quantile(x, 0.75, na.rm = TRUE)
  iqr <- q3 - q1
  lower <- q1 - multiplier * iqr
  upper <- q3 + multiplier * iqr
  return(x < lower | x > upper)
}

# Extract numeric data for outlier detection
numeric_data_temp <- pca_df |> select(all_of(numeric_features))

# Identify outliers across all numeric features
outlier_flags <- numeric_data_temp |>
  transmute(across(everything(), ~ detect_outliers(.), .names = "{.col}_outlier"))

# Count outliers per Pokemon
pca_df$outlier_count <- rowSums(outlier_flags)

# Identify extreme outliers (outliers in 3+ features)
extreme_outliers <- pca_df |>
  filter(outlier_count >= 3) |>
  arrange(desc(outlier_count))

cat(glue("
Extreme outliers (3+ features affected): {nrow(extreme_outliers)}\n\n"))
## Extreme outliers (3+ features affected): 4
if (nrow(extreme_outliers) > 0) {
  outlier_summary <- extreme_outliers |>
    select(
      dex, name, category, outlier_count, hp, attack, defense,
      height_m, weight_kgs, bmi
    ) |>
    head(10)

  print(knitr::kable(outlier_summary,
    caption = "Top Extreme Outliers",
    digits = 1
  ))
}
## 
## 
## Table: Top Extreme Outliers
## 
## | dex|name         |category    | outlier_count|  hp| attack| defense| height_m| weight_kgs|  bmi|
## |---:|:------------|:-----------|-------------:|---:|------:|-------:|--------:|----------:|----:|
## | 208|Steelix      |Regular     |             3|  75|     85|     200|      9.2|        400|  4.7|
## | 208|Mega Steelix |Regular     |             3|  75|    125|     230|     10.5|        740|  6.7|
## | 799|Guzzlord     |Ultra Beast |             3| 223|    101|      53|      5.5|        888| 29.4|
## | 805|Stakataka    |Ultra Beast |             3|  61|    131|     211|      5.5|        820| 27.1|
# Visualize outlier distribution
ggplot(pca_df, aes(x = outlier_count)) +
  geom_bar(fill = "steelblue", alpha = 0.7) +
  labs(
    title = "Distribution of Outlier Counts per Pokemon",
    subtitle = "Number of features where Pokemon is an outlier (>3 IQR)",
    x = "Number of Outlier Features",
    y = "Count"
  ) +
  theme_minimal()

# Box plots for key features showing outliers
key_features <- c("hp", "attack", "defense", "height_m", "weight_kgs", "bmi")

pca_df |>
  select(all_of(key_features)) |>
  pivot_longer(everything(), names_to = "Feature", values_to = "Value") |>
  ggplot(aes(x = Feature, y = Value)) +
  geom_boxplot(fill = "lightblue", outlier.color = "red", outlier.size = 2) +
  facet_wrap(~Feature, scales = "free", ncol = 3) +
  labs(
    title = "Box Plots Showing Outliers in Key Features",
    subtitle = "Red points indicate outliers"
  ) +
  theme_minimal() +
  theme(axis.text.x = element_blank())

2.4 Outlier Treatment Strategy

cat("\n=== OUTLIER TREATMENT OPTIONS ===\n\n")
## 
## === OUTLIER TREATMENT OPTIONS ===
# Option 1: Analyze with all data
pca_df_full <- pca_df

# Option 2: Remove extreme outliers (3+ features)
pca_df_clean <- pca_df |>
  filter(outlier_count < 3)

# Option 3: Winsorize extreme values (cap at 99th percentile)
pca_df_winsorized <- pca_df
for (col in numeric_features) {
  p99 <- quantile(pca_df[[col]], 0.99, na.rm = TRUE)
  p01 <- quantile(pca_df[[col]], 0.01, na.rm = TRUE)
  pca_df_winsorized[[col]] <- pmax(pmin(pca_df[[col]], p99), p01)
}

cat(glue("
Dataset Sizes:
"))
## Dataset Sizes:
# Highlight removed outliers
if (nrow(extreme_outliers) > 0) {
  cat("\nRemoved extreme outliers:\n")
  removed <- extreme_outliers |>
    select(name, category, outlier_count) |>
    arrange(desc(outlier_count))
  print(knitr::kable(removed,
    caption = "Pokemon Removed as Extreme Outliers"
  ))
}
## 
## Removed extreme outliers:
## 
## 
## Table: Pokemon Removed as Extreme Outliers
## 
## |name         |category    | outlier_count|
## |:------------|:-----------|-------------:|
## |Steelix      |Regular     |             3|
## |Mega Steelix |Regular     |             3|
## |Guzzlord     |Ultra Beast |             3|
## |Stakataka    |Ultra Beast |             3|
# Decision: Use cleaned data for main analysis
cat("\n** DECISION: Using cleaned dataset (extreme outliers removed) for main analysis **\n")
## 
## ** DECISION: Using cleaned dataset (extreme outliers removed) for main analysis **
cat("** Outliers will be discussed separately in Section 6 **\n\n")
## ** Outliers will be discussed separately in Section 6 **
# Update working dataset
pca_df <- pca_df_clean

# Re-select numeric features from cleaned data
numeric_features_clean <- numeric_features # Keep feature names

# --- Additional outlier handling: single-variable z-score detection and export ---
## Detect extreme single-variable values using z-score (robust threshold)
zscore_threshold <- 6
zscore_flags <- pca_df_full |>
  select(all_of(numeric_features)) |>
  transmute(across(everything(), ~ abs(scale(.)), .names = "{.col}_z"))

# Identify rows where any variable exceeds threshold
zscore_outliers <- which(rowSums(zscore_flags > zscore_threshold) > 0)

# Create a combined outliers dataframe (IQR-based extreme OR z-score extreme)
outlier_dexes <- unique(c(extreme_outliers$dex, pca_df_full$dex[zscore_outliers]))

pokemon_outliers <- pca_df_full |>
  filter(dex %in% outlier_dexes) |>
  select(dex, name, category, generation, all_of(numeric_features), outlier_count) |>
  arrange(desc(outlier_count))

# Ensure export directory exists and write CSV
if (!dir.exists("data/analysis/pca")) dir.create("data/analysis/pca", recursive = TRUE)
write_csv(pokemon_outliers, "data/analysis/pca/pokemon_outliers.csv")

cat(glue("\nExported {nrow(pokemon_outliers)} combined outliers to 'data/analysis/pca/pokemon_outliers.csv'\n"))
## Exported 14 combined outliers to 'data/analysis/pca/pokemon_outliers.csv'
# Update cleaned dataset to exclude any combined outliers (conservative)
pca_df <- pca_df_full |>
  filter(!dex %in% outlier_dexes)

# Recompute numeric data and scaled_data after final cleaning
numeric_features_clean <- numeric_features
numeric_data <- pca_df |>
  select(all_of(numeric_features_clean))
scaled_data <- scale(numeric_data)

2.5 Scale Features (After Outlier Removal)

# Extract numeric data for scaling (from cleaned dataset)
numeric_data <- pca_df |>
  select(all_of(numeric_features))

# Standardize (mean=0, sd=1)
scaled_data <- scale(numeric_data)

# Verify scaling
cat("Scaled data summary:\n")
## Scaled data summary:
summary(scaled_data)
##        hp               attack            defense            sp_atk        
##  Min.   :-3.61510   Min.   :-2.93743   Min.   :-2.4607   Min.   :-2.42329  
##  1st Qu.:-0.59203   1st Qu.:-0.71699   1st Qu.:-0.6892   1st Qu.:-0.85749  
##  Median :-0.09004   Median :-0.03379   Median :-0.2464   Median :-0.07459  
##  Mean   : 0.00000   Mean   : 0.00000   Mean   : 0.0000   Mean   : 0.00000  
##  3rd Qu.: 0.57928   3rd Qu.: 0.64942   3rd Qu.: 0.4917   3rd Qu.: 0.67699  
##  Max.   : 5.97842   Max.   : 3.21146   Max.   : 5.2895   Max.   : 3.33885  
##      sp_def            speed            height_m         weight_kgs      
##  Min.   :-2.7222   Min.   :-2.6353   Min.   :-1.2379   Min.   :-0.68996  
##  1st Qu.:-0.6434   1st Qu.:-0.7498   1st Qu.:-0.5075   1st Qu.:-0.52745  
##  Median :-0.1445   Median : 0.1072   Median :-0.1829   Median :-0.33927  
##  Mean   : 0.0000   Mean   : 0.0000   Mean   : 0.0000   Mean   : 0.00000  
##  3rd Qu.: 0.6038   3rd Qu.: 0.6214   3rd Qu.: 0.2228   3rd Qu.: 0.05265  
##  Max.   : 6.0086   Max.   : 4.0495   Max.   : 7.5262   Max.   : 6.06588  
##       bmi         
##  Min.   :-1.0254  
##  1st Qu.:-0.5386  
##  Median :-0.2589  
##  Mean   : 0.0000  
##  3rd Qu.: 0.1353  
##  Max.   : 7.6640

3. Principal Component Analysis (PCA)

3.1 Compute PCA

# Perform PCA on scaled data
# Data is already standardized, so center=FALSE, scale=FALSE
pca_result <- prcomp(scaled_data)

# Summary
summary(pca_result)
## Importance of components:
##                           PC1    PC2    PC3    PC4     PC5     PC6     PC7
## Standard deviation     1.5998 1.2790 1.1556 0.9580 0.92097 0.74737 0.73193
## Proportion of Variance 0.2844 0.1817 0.1484 0.1020 0.09424 0.06206 0.05953
## Cumulative Proportion  0.2844 0.4661 0.6145 0.7165 0.81073 0.87280 0.93232
##                            PC8     PC9
## Standard deviation     0.64145 0.44457
## Proportion of Variance 0.04572 0.02196
## Cumulative Proportion  0.97804 1.00000
# Scree plot - variance explained
fviz_eig(pca_result,
  addlabels = TRUE, ylim = c(0, 50),
  main = "Scree Plot - Variance Explained by Each PC"
)

3.2 Loadings and Contributions

# Variable contributions to PC1 and PC2
fviz_pca_var(pca_result,
  col.var = "contrib",
  gradient.cols = c("#00AFBB", "#E7B800", "#FC4E07"),
  repel = TRUE,
  title = "PCA - Variable Contributions"
)

# Loadings for first 3 PCs
loadings_df <- as.data.frame(pca_result$rotation[, 1:3]) |>
  rownames_to_column("Variable") |>
  arrange(desc(abs(PC1)))

print(knitr::kable(loadings_df,
  digits = 3,
  caption = "PCA Loadings (First 3 Components)"
))
## 
## 
## Table: PCA Loadings (First 3 Components)
## 
## |Variable   |    PC1|    PC2|    PC3|
## |:----------|------:|------:|------:|
## |weight_kgs |  0.543|  0.049| -0.110|
## |height_m   |  0.474| -0.318|  0.033|
## |hp         |  0.368| -0.075| -0.213|
## |defense    |  0.359|  0.370|  0.214|
## |attack     |  0.331| -0.165| -0.445|
## |sp_def     |  0.208|  0.025|  0.700|
## |bmi        |  0.179|  0.472| -0.153|
## |sp_atk     |  0.143| -0.491|  0.389|
## |speed      | -0.106| -0.512| -0.181|

3.3 Biplot - Pokemon in PC Space

# Add PC scores to dataframe
pca_scores <- as.data.frame(pca_result$x[, 1:5])
pca_plot_df <- bind_cols(pca_df, pca_scores)

# PC1 vs PC2 colored by category
ggplot(pca_plot_df, aes(x = PC1, y = PC2, color = category)) +
  geom_point(alpha = 0.6, size = 2) +
  scale_color_brewer(palette = "Set2") +
  labs(
    title = "PCA: Pokemon by Category",
    subtitle = "First two principal components",
    x = glue("PC1 ({round(summary(pca_result)$importance[2,1]*100, 1)}% variance)"),
    y = glue("PC2 ({round(summary(pca_result)$importance[2,2]*100, 1)}% variance)"),
    color = "Category"
  ) +
  theme(legend.position = "right")

# PC1 vs PC2 colored by generation
ggplot(pca_plot_df, aes(x = PC1, y = PC2, color = generation)) +
  geom_point(alpha = 0.6, size = 2) +
  scale_color_brewer(palette = "Spectral") +
  labs(
    title = "PCA: Pokemon by Generation",
    x = glue("PC1 ({round(summary(pca_result)$importance[2,1]*100, 1)}% variance)"),
    y = glue("PC2 ({round(summary(pca_result)$importance[2,2]*100, 1)}% variance)"),
    color = "Generation"
  )

# PC1 vs PC2 by primary type (top types only)
top_types <- pca_plot_df |>
  count(type_1, sort = TRUE) |>
  head(8) |>
  pull(type_1)

pca_plot_df |>
  filter(type_1 %in% top_types) |>
  ggplot(aes(x = PC1, y = PC2, color = type_1)) +
  geom_point(alpha = 0.6, size = 2) +
  labs(
    title = "PCA: Pokemon by Primary Type (Top 8)",
    x = glue("PC1 ({round(summary(pca_result)$importance[2,1]*100, 1)}% variance)"),
    y = glue("PC2 ({round(summary(pca_result)$importance[2,2]*100, 1)}% variance)"),
    color = "Type"
  )

3.4 3D Visualization (PC1, PC2, PC3)

# Prepare 3D plot data
pca_3d_df <- pca_plot_df |>
  select(name, PC1, PC2, PC3, category, type_1)

# Interactive 3D plot (if plotly available)
if (requireNamespace("plotly", quietly = TRUE)) {
  library(plotly)

  plot_ly(pca_3d_df,
    x = ~PC1, y = ~PC2, z = ~PC3,
    color = ~category, colors = "Set2",
    text = ~name, type = "scatter3d", mode = "markers",
    marker = list(size = 3)
  ) |>
    layout(
      title = "PCA 3D: First Three Components",
      scene = list(
        xaxis = list(title = "PC1"),
        yaxis = list(title = "PC2"),
        zaxis = list(title = "PC3")
      )
    )
} else {
  cat("Install plotly for interactive 3D visualization\n")
}

4. Clustering Analysis

4.1 K-Means Clustering

# Use k=4 for clustering (a reasonable choice for Pokemon categorization)
k <- 4
set.seed(123)
kmeans_result <- kmeans(scaled_data, centers = k, nstart = 25)

cat(glue("
K-Means Clustering (k={k}):
- Cluster sizes: {paste(kmeans_result$size, collapse=', ')}
- Total within-cluster SS: {round(kmeans_result$tot.withinss, 2)}
- Between-cluster SS / Total SS: {round(kmeans_result$betweenss / kmeans_result$totss * 100, 1)}%
"))
## K-Means Clustering (k=4):
## - Cluster sizes: 50, 57, 300, 321
## - Total within-cluster SS: 4346.37
## - Between-cluster SS / Total SS: 33.6%
# Add cluster labels to dataframe
pca_plot_df$cluster <- factor(kmeans_result$cluster)
# Visualize clusters in PCA space
fviz_cluster(kmeans_result,
  data = scaled_data,
  palette = "Set2",
  ellipse.type = "convex",
  ggtheme = theme_minimal(),
  main = "K-Means Clusters in PCA Space"
)

# Manual plot with more control
ggplot(pca_plot_df, aes(x = PC1, y = PC2, color = cluster)) +
  geom_point(alpha = 0.6, size = 2.5) +
  stat_ellipse(level = 0.95, linetype = 2) +
  scale_color_brewer(palette = "Set1") +
  labs(
    title = glue("K-Means Clustering (k={k}) in PCA Space"),
    x = glue("PC1 ({round(summary(pca_result)$importance[2,1]*100, 1)}%)"),
    y = glue("PC2 ({round(summary(pca_result)$importance[2,2]*100, 1)}%)"),
    color = "Cluster"
  )

4.3 Cluster Profiling

# Attach cluster labels to original data
cluster_profile_df <- pca_df |>
  mutate(cluster = factor(kmeans_result$cluster))

# Compute cluster statistics
cluster_stats <- cluster_profile_df |>
  group_by(cluster) |>
  summarise(
    count = n(),
    avg_hp = mean(hp),
    avg_attack = mean(attack),
    avg_defense = mean(defense),
    avg_sp_atk = mean(sp_atk),
    avg_sp_def = mean(sp_def),
    avg_speed = mean(speed),
    avg_total = mean(hp + attack + defense + sp_atk + sp_def + speed),
    pct_legendary = mean(category == "Legendary") * 100,
    pct_mythical = mean(category == "Mythical") * 100
  ) |>
  arrange(cluster)

print(knitr::kable(cluster_stats,
  digits = 1,
  caption = "Cluster Profiles - Average Stats"
))
## 
## 
## Table: Cluster Profiles - Average Stats
## 
## |cluster | count| avg_hp| avg_attack| avg_defense| avg_sp_atk| avg_sp_def| avg_speed| avg_total| pct_legendary| pct_mythical|
## |:-------|-----:|------:|----------:|-----------:|----------:|----------:|---------:|---------:|-------------:|------------:|
## |1       |    50|  108.0|      127.0|       105.8|      121.5|      108.9|      95.0|     666.3|          70.0|          4.0|
## |2       |    57|   90.8|      108.2|       119.5|       70.6|       83.7|      58.6|     531.5|          12.3|          1.8|
## |3       |   300|   70.3|       87.7|        68.5|       89.7|       75.6|     100.8|     492.6|           8.0|          3.7|
## |4       |   321|   87.4|       96.7|        94.9|       82.9|       91.3|      66.2|     519.4|          10.6|          4.4|
# Heatmap of cluster centers
cluster_centers <- as.data.frame(kmeans_result$centers) |>
  rownames_to_column("Cluster") |>
  pivot_longer(-Cluster, names_to = "Feature", values_to = "Value")

ggplot(cluster_centers, aes(x = Feature, y = Cluster, fill = Value)) +
  geom_tile() +
  scale_fill_gradient2(low = "blue", mid = "white", high = "red", midpoint = 0) +
  labs(
    title = "K-Means Cluster Centers (Standardized)",
    x = "", y = "Cluster", fill = "Scaled\nValue"
  ) +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))

# Cluster composition by category
cluster_category <- cluster_profile_df |>
  count(cluster, category) |>
  group_by(cluster) |>
  mutate(percentage = round(100 * n / sum(n), 1))

ggplot(cluster_category, aes(x = cluster, y = n, fill = category)) +
  geom_col(position = "stack") +
  geom_text(aes(label = glue("{n} ({percentage}%)")),
    position = position_stack(vjust = 0.5), size = 3
  ) +
  scale_fill_brewer(palette = "Set2") +
  labs(
    title = "Cluster Composition by Pokemon Category",
    x = "Cluster", y = "Count", fill = "Category"
  )

4.4 Hierarchical Clustering (Beyond Course Scope)

# [COMMENTED OUT - Beyond course scope]
# Hierarchical clustering requires understanding of distance metrics,
# linkage methods, and dendrogram interpretation

# # Compute distance matrix (sample for speed)
# set.seed(123)
# sample_idx <- sample(1:nrow(scaled_data), min(200, nrow(scaled_data)))
# dist_matrix <- dist(scaled_data[sample_idx, ], method = "euclidean")
#
# # Hierarchical clustering
# hc_result <- hclust(dist_matrix, method = "ward.D2")
#
# # Dendrogram
# fviz_dend(hc_result,
#   k = k,
#   cex = 0.5,
#   k_colors = "Set2",
#   color_labels_by_k = TRUE,
#   rect = TRUE,
#   main = glue("Hierarchical Clustering Dendrogram (n={length(sample_idx)})")
# )

6. Outlier Pokemon Analysis

6.1 Extreme Outliers Discussion

cat("=== EXTREME OUTLIER POKEMON ===\n\n")
## === EXTREME OUTLIER POKEMON ===
# Re-examine extreme outliers
extreme_cases <- pca_df_full |>
  filter(outlier_count >= 3) |>
  arrange(desc(outlier_count))

if (nrow(extreme_cases) > 0) {
  cat(glue("Total extreme outliers: {nrow(extreme_cases)}\n\n"))

  # Detailed view
  extreme_detail <- extreme_cases |>
    select(
      dex, name, category, generation, type_1,
      hp, attack, defense, sp_atk, sp_def, speed,
      height_m, weight_kgs, bmi, outlier_count
    )

  print(knitr::kable(extreme_detail,
    caption = "Detailed Stats of Extreme Outliers",
    digits = 1
  ))

  # Why are they outliers?
  cat("\n### Why These Pokemon Are Outliers:\n\n")

  for (i in 1:min(5, nrow(extreme_cases))) {
    poke <- extreme_cases[i, ]
    cat(glue("{i}. **{poke$name}** ({poke$category})\n"))

    # Find which features are outliers
    outlier_features <- c()
    for (feat in numeric_features) {
      x <- pca_df_full[[feat]]
      q1 <- quantile(x, 0.25, na.rm = TRUE)
      q3 <- quantile(x, 0.75, na.rm = TRUE)
      iqr <- q3 - q1
      lower <- q1 - 3 * iqr
      upper <- q3 + 3 * iqr

      val <- poke[[feat]]
      if (val < lower | val > upper) {
        outlier_features <- c(
          outlier_features,
          glue("{feat}={round(val, 1)}")
        )
      }
    }
    cat(glue("   Extreme in: {paste(outlier_features, collapse=', ')}\n"))
    cat("\n")
  }
}
## Total extreme outliers: 4
## 
## 
## Table: Detailed Stats of Extreme Outliers
## 
## | dex|name         |category    |generation |type_1 |  hp| attack| defense| sp_atk| sp_def| speed| height_m| weight_kgs|  bmi| outlier_count|
## |---:|:------------|:-----------|:----------|:------|---:|------:|-------:|------:|------:|-----:|--------:|----------:|----:|-------------:|
## | 208|Steelix      |Regular     |Gen 2      |Steel  |  75|     85|     200|     55|     65|    30|      9.2|        400|  4.7|             3|
## | 208|Mega Steelix |Regular     |Gen 2      |Steel  |  75|    125|     230|     55|     95|    30|     10.5|        740|  6.7|             3|
## | 799|Guzzlord     |Ultra Beast |Gen 7      |Dark   | 223|    101|      53|     97|     53|    43|      5.5|        888| 29.4|             3|
## | 805|Stakataka    |Ultra Beast |Gen 7      |Rock   |  61|    131|     211|     53|    101|    13|      5.5|        820| 27.1|             3|
## 
## ### Why These Pokemon Are Outliers:
## 
## 1. **Steelix** (Regular)Extreme in: defense=200, height_m=9.2, weight_kgs=400
## 2. **Mega Steelix** (Regular)Extreme in: defense=230, height_m=10.5, weight_kgs=740
## 3. **Guzzlord** (Ultra Beast)Extreme in: hp=223, height_m=5.5, weight_kgs=888
## 4. **Stakataka** (Ultra Beast)Extreme in: defense=211, height_m=5.5, weight_kgs=820

6.2 Visualize Outliers in PCA Space

# Perform PCA on FULL dataset to see where outliers fall
scaled_full <- scale(pca_df_full |> select(all_of(numeric_features)))
pca_full <- prcomp(scaled_full, center = TRUE, scale. = TRUE)

pca_full_df <- pca_df_full |>
  bind_cols(as.data.frame(pca_full$x[, 1:2])) |>
  mutate(is_extreme = outlier_count >= 3)

# Plot outliers in PCA space
ggplot(pca_full_df, aes(x = PC1, y = PC2, color = is_extreme, size = is_extreme)) +
  geom_point(alpha = 0.6) +
  geom_text(
    data = filter(pca_full_df, is_extreme),
    aes(label = name),
    size = 3, vjust = -1, hjust = 0.5, color = "black"
  ) +
  scale_color_manual(
    values = c("FALSE" = "gray70", "TRUE" = "red"),
    labels = c("Normal", "Extreme Outlier")
  ) +
  scale_size_manual(values = c("FALSE" = 2, "TRUE" = 4), guide = "none") +
  labs(
    title = "Extreme Outliers in PCA Space (Full Dataset)",
    subtitle = "Outliers are labeled and shown in red",
    color = "Status"
  ) +
  theme_minimal()

6.3 Impact on Clustering

cat("=== IMPACT OF OUTLIERS ON CLUSTERING ===\n\n")
## === IMPACT OF OUTLIERS ON CLUSTERING ===
# Compare clustering with and without outliers
set.seed(123)
k <- 4

# Cluster with outliers
kmeans_with_outliers <- kmeans(scaled_full, centers = k, nstart = 25)

# Cluster without outliers (already computed earlier)
kmeans_clean <- kmeans(scaled_data, centers = k, nstart = 25)

cat(glue("
With outliers ({nrow(pca_df_full)} Pokemon):
- Total WSS: {round(kmeans_with_outliers$tot.withinss, 2)}
- Between SS / Total SS: {round(kmeans_with_outliers$betweenss / kmeans_with_outliers$totss * 100, 1)}%

Without extreme outliers ({nrow(pca_df)} Pokemon):
- Total WSS: {round(kmeans_clean$tot.withinss, 2)}
- Between SS / Total SS: {round(kmeans_clean$betweenss / kmeans_clean$totss * 100, 1)}%
"))
## With outliers (742 Pokemon):
## - Total WSS: 4525.82
## - Between SS / Total SS: 32.1%
## 
## Without extreme outliers (728 Pokemon):
## - Total WSS: 4345.42
## - Between SS / Total SS: 33.6%
# Silhouette comparison
if (requireNamespace("cluster", quietly = TRUE)) {
  library(cluster)

  sil_with <- silhouette(kmeans_with_outliers$cluster, dist(scaled_full))
  sil_without <- silhouette(kmeans_clean$cluster, dist(scaled_data))

  cat(glue("
Average Silhouette Width:
- With outliers: {round(mean(sil_with[, 'sil_width']), 3)}
- Without outliers: {round(mean(sil_without[, 'sil_width']), 3)}
"))

  cat("\n** Removing outliers improved clustering quality **\n")
}
## Average Silhouette Width:
## - With outliers: 0.14
## - Without outliers: 0.145
## ** Removing outliers improved clustering quality **

6.4 Outlier Insights

cat("\n=== KEY INSIGHTS ABOUT OUTLIERS ===\n\n")
## 
## === KEY INSIGHTS ABOUT OUTLIERS ===
outlier_patterns <- extreme_cases |>
  count(category, sort = TRUE)

cat("Outliers by Category:\n")
## Outliers by Category:
print(outlier_patterns)
## # A tibble: 2 × 2
##   category        n
##   <chr>       <int>
## 1 Regular         2
## 2 Ultra Beast     2
cat(glue("

Conclusions:
1. Most extreme outliers are **{outlier_patterns$category[1]}** Pokemon
2. Outliers represent special/unique designs (e.g., Eternamax forms, ultra-dense cores)
3. Including them in clustering:
   - Distorts cluster boundaries
   - Creates artificial singleton clusters
   - Reduces interpretability
4. **Recommendation**: Analyze them separately as special cases
5. Main clustering analysis uses cleaned data for better pattern recognition
"))
## 
## Conclusions:
## 1. Most extreme outliers are **Regular** Pokemon
## 2. Outliers represent special/unique designs (e.g., Eternamax forms, ultra-dense cores)
## 3. Including them in clustering:
##    - Distorts cluster boundaries
##    - Creates artificial singleton clusters
##    - Reduces interpretability
## 4. **Recommendation**: Analyze them separately as special cases
## 5. Main clustering analysis uses cleaned data for better pattern recognition

7. Interpretation and Insights

7.1 PCA Interpretation

cat(glue("
=== PCA KEY FINDINGS ===

1. Variance Explained:
   - PC1: {round(summary(pca_result)$importance[2,1]*100, 1)}%
   - PC2: {round(summary(pca_result)$importance[2,2]*100, 1)}%
   - PC3: {round(summary(pca_result)$importance[2,3]*100, 1)}%
   - Cumulative (PC1-3): {round(summary(pca_result)$importance[3,3]*100, 1)}%

2. PC1 Interpretation:
   Top loadings: {paste(head(loadings_df$Variable, 3), collapse=', ')}
   - Likely represents overall 'power' or 'stat total'

3. PC2 Interpretation:
   - Distinguishes between different stat distributions
   - May separate physical vs special attackers

4. Separation:
   - Legendary Pokemon tend to cluster in high PC1 region
   - Clear separation between categories visible
   - Generation shows some temporal evolution patterns
"))
## === PCA KEY FINDINGS ===
## 
## 1. Variance Explained:
##    - PC1: 28.4%
##    - PC2: 18.2%
##    - PC3: 14.8%
##    - Cumulative (PC1-3): 61.5%
## 
## 2. PC1 Interpretation:
##    Top loadings: weight_kgs, height_m, hp
##    - Likely represents overall 'power' or 'stat total'
## 
## 3. PC2 Interpretation:
##    - Distinguishes between different stat distributions
##    - May separate physical vs special attackers
## 
## 4. Separation:
##    - Legendary Pokemon tend to cluster in high PC1 region
##    - Clear separation between categories visible
##    - Generation shows some temporal evolution patterns

7.2 Clustering Interpretation

# Identify dominant characteristics of each cluster
cluster_labels <- cluster_stats |>
  mutate(
    label = case_when(
      avg_total > 550 ~ "High-Power",
      avg_speed > 1 & avg_attack > 0.5 ~ "Fast Attackers",
      avg_defense > 0.5 | avg_sp_def > 0.5 ~ "Defensive",
      TRUE ~ "Balanced/Low-Power"
    )
  ) |>
  select(cluster, label, count, avg_total, pct_legendary)

print(knitr::kable(cluster_labels,
  digits = 1,
  caption = "Cluster Interpretations"
))
## 
## 
## Table: Cluster Interpretations
## 
## |cluster |label          | count| avg_total| pct_legendary|
## |:-------|:--------------|-----:|---------:|-------------:|
## |1       |High-Power     |    50|     666.3|          70.0|
## |2       |Fast Attackers |    57|     531.5|          12.3|
## |3       |Fast Attackers |   300|     492.6|           8.0|
## |4       |Fast Attackers |   321|     519.4|          10.6|
cat(glue("
=== CLUSTERING KEY FINDINGS ===

1. Optimal Clusters: {k}
   Based on elbow method and silhouette analysis

2. Cluster Characteristics:
   {paste(apply(cluster_labels, 1, function(x)
     glue('   Cluster {x[1]}: {x[2]} (n={x[3]}, avg_total={round(as.numeric(x[4]),1)})')),
     collapse='\n')}

3. Legendary Distribution:
   - Predominantly in high-power clusters
   - {round(max(cluster_stats$pct_legendary), 1)}% of one cluster are legendary

4. Practical Use:
   - Can identify Pokemon archetypes (tank, sweeper, balanced)
   - Useful for team building and role assignment
   - Reveals design patterns across generations
"))
## === CLUSTERING KEY FINDINGS ===
## 
## 1. Optimal Clusters: 4
##    Based on elbow method and silhouette analysis
## 
## 2. Cluster Characteristics:
##       Cluster 1: High-Power (n= 50, avg_total=666.3)
##    Cluster 2: Fast Attackers (n= 57, avg_total=531.5)
##    Cluster 3: Fast Attackers (n=300, avg_total=492.6)
##    Cluster 4: Fast Attackers (n=321, avg_total=519.4)
## 
## 3. Legendary Distribution:
##    - Predominantly in high-power clusters
##    - 70% of one cluster are legendary
## 
## 4. Practical Use:
##    - Can identify Pokemon archetypes (tank, sweeper, balanced)
##    - Useful for team building and role assignment
##    - Reveals design patterns across generations

8. Export Results

# Ensure output directory exists
if (!dir.exists("data/analysis/pca")) dir.create("data/analysis/pca", recursive = TRUE)

# Save PCA results
write_csv(pca_plot_df, "data/analysis/pca/pokemon_pca_scores.csv")

# Save cluster assignments
cluster_export <- cluster_profile_df |>
  select(dex, name, category, cluster)
write_csv(cluster_export, "data/analysis/pca/pokemon_clusters.csv")

cat("Results saved to data/analysis/pca/:
- pokemon_pca_scores.csv (with PC1-PC5)
- pokemon_clusters.csv (cluster assignments)
- pokemon_outliers.csv (extreme outliers)
")
## Results saved to data/analysis/pca/:
## - pokemon_pca_scores.csv (with PC1-PC5)
## - pokemon_clusters.csv (cluster assignments)
## - pokemon_outliers.csv (extreme outliers)